{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/Users/haithamelmarakeby/PycharmProjects/pnet2'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import itertools\n",
    "import os\n",
    "import sys\n",
    "from os.path import dirname, join, realpath\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# Project root: two directory levels above the notebook's working directory.\n",
    "base_dir = dirname(dirname(os.getcwd()))\n",
    "base_dir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Put base_dir first on sys.path; presumably this is what lets the project\n",
    "# modules imported below (config_path, data) resolve from the notebook.\n",
    "sys.path.insert(0, base_dir)\n",
    "from config_path import PROSTATE_DATA_PATH, PLOTS_PATH, GENE_PATH\n",
    "from data.data_access import Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/Users/haithamelmarakeby/PycharmProjects/pnet2/_database/genes/hotspots_v2.xls'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Path of the hotspot Excel workbook under GENE_PATH (read in the next cell).\n",
    "hotspot_file= join(GENE_PATH, 'hotspots_v2.xls')\n",
    "hotspot_file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the SNV and INDEL hotspot sheets of the workbook into separate frames.\n",
    "hotspots_df = pd.read_excel(hotspot_file, sheet_name='SNV-hotspots')\n",
    "hotspots_df_indel = pd.read_excel(hotspot_file, sheet_name='INDEL-hotspots')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Hugo_Symbol</th>\n",
       "      <th>Amino_Acid_Position</th>\n",
       "      <th>log10_pvalue</th>\n",
       "      <th>Mutation_Count</th>\n",
       "      <th>Reference_Amino_Acid</th>\n",
       "      <th>Total_Mutations_in_Gene</th>\n",
       "      <th>Median_Allele_Freq_Rank</th>\n",
       "      <th>Allele_Freq_Rank</th>\n",
       "      <th>SNP_ID</th>\n",
       "      <th>Variant_Amino_Acid</th>\n",
       "      <th>...</th>\n",
       "      <th>pad24entropy</th>\n",
       "      <th>pad36entropy</th>\n",
       "      <th>TP</th>\n",
       "      <th>reason</th>\n",
       "      <th>n_MSK</th>\n",
       "      <th>n_Retro</th>\n",
       "      <th>judgement</th>\n",
       "      <th>inNBT</th>\n",
       "      <th>inOncokb</th>\n",
       "      <th>Samples</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>SMARCA4</td>\n",
       "      <td>546</td>\n",
       "      <td>-7.752356</td>\n",
       "      <td>5</td>\n",
       "      <td>QK:5</td>\n",
       "      <td>101</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>:NA</td>\n",
       "      <td>K546del:5</td>\n",
       "      <td>...</td>\n",
       "      <td>1.295006</td>\n",
       "      <td>1.339653</td>\n",
       "      <td>False</td>\n",
       "      <td>LOCAL_ENTROPY</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>RETAIN</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>cnsbrain:4|lymph:1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>CDKN2A</td>\n",
       "      <td>27-42</td>\n",
       "      <td>-6.821115</td>\n",
       "      <td>12</td>\n",
       "      <td>VRALLEA:4|LEAGALP:3|ALPN:1|EV:1|GA:1|PNAPN:1|R...</td>\n",
       "      <td>219</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>:NA</td>\n",
       "      <td>V28_E33del:4</td>\n",
       "      <td>...</td>\n",
       "      <td>1.130088</td>\n",
       "      <td>1.157763</td>\n",
       "      <td>False</td>\n",
       "      <td>LOCAL_ENTROPY</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>RETAIN</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>cervix:1|esophagusstomach:1|lung:1|pancreas:1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>CDKN2A</td>\n",
       "      <td>27-42</td>\n",
       "      <td>-6.821115</td>\n",
       "      <td>12</td>\n",
       "      <td>VRALLEA:4|LEAGALP:3|ALPN:1|EV:1|GA:1|PNAPN:1|R...</td>\n",
       "      <td>219</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>:NA</td>\n",
       "      <td>L32_L37del:3</td>\n",
       "      <td>...</td>\n",
       "      <td>1.130088</td>\n",
       "      <td>1.157763</td>\n",
       "      <td>False</td>\n",
       "      <td>LOCAL_ENTROPY</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>RETAIN</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>skin:2|esophagusstomach:1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>CDKN2A</td>\n",
       "      <td>27-42</td>\n",
       "      <td>-6.821115</td>\n",
       "      <td>12</td>\n",
       "      <td>VRALLEA:4|LEAGALP:3|ALPN:1|EV:1|GA:1|PNAPN:1|R...</td>\n",
       "      <td>219</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>:NA</td>\n",
       "      <td>A36_N39delinsD:1</td>\n",
       "      <td>...</td>\n",
       "      <td>1.130088</td>\n",
       "      <td>1.157763</td>\n",
       "      <td>False</td>\n",
       "      <td>LOCAL_ENTROPY</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>RETAIN</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>lung:1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>CDKN2A</td>\n",
       "      <td>27-42</td>\n",
       "      <td>-6.821115</td>\n",
       "      <td>12</td>\n",
       "      <td>VRALLEA:4|LEAGALP:3|ALPN:1|EV:1|GA:1|PNAPN:1|R...</td>\n",
       "      <td>219</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>:NA</td>\n",
       "      <td>E27del:1</td>\n",
       "      <td>...</td>\n",
       "      <td>1.130088</td>\n",
       "      <td>1.157763</td>\n",
       "      <td>False</td>\n",
       "      <td>LOCAL_ENTROPY</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>RETAIN</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>lung:1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 37 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  Hugo_Symbol Amino_Acid_Position  log10_pvalue  Mutation_Count  \\\n",
       "0     SMARCA4                 546     -7.752356               5   \n",
       "1      CDKN2A               27-42     -6.821115              12   \n",
       "2      CDKN2A               27-42     -6.821115              12   \n",
       "3      CDKN2A               27-42     -6.821115              12   \n",
       "4      CDKN2A               27-42     -6.821115              12   \n",
       "\n",
       "                                Reference_Amino_Acid  Total_Mutations_in_Gene  \\\n",
       "0                                               QK:5                      101   \n",
       "1  VRALLEA:4|LEAGALP:3|ALPN:1|EV:1|GA:1|PNAPN:1|R...                      219   \n",
       "2  VRALLEA:4|LEAGALP:3|ALPN:1|EV:1|GA:1|PNAPN:1|R...                      219   \n",
       "3  VRALLEA:4|LEAGALP:3|ALPN:1|EV:1|GA:1|PNAPN:1|R...                      219   \n",
       "4  VRALLEA:4|LEAGALP:3|ALPN:1|EV:1|GA:1|PNAPN:1|R...                      219   \n",
       "\n",
       "   Median_Allele_Freq_Rank Allele_Freq_Rank SNP_ID Variant_Amino_Acid  \\\n",
       "0                      NaN              NaN    :NA          K546del:5   \n",
       "1                      NaN              NaN    :NA       V28_E33del:4   \n",
       "2                      NaN              NaN    :NA       L32_L37del:3   \n",
       "3                      NaN              NaN    :NA   A36_N39delinsD:1   \n",
       "4                      NaN              NaN    :NA           E27del:1   \n",
       "\n",
       "                       ...                       pad24entropy pad36entropy  \\\n",
       "0                      ...                           1.295006     1.339653   \n",
       "1                      ...                           1.130088     1.157763   \n",
       "2                      ...                           1.130088     1.157763   \n",
       "3                      ...                           1.130088     1.157763   \n",
       "4                      ...                           1.130088     1.157763   \n",
       "\n",
       "      TP         reason  n_MSK  n_Retro  judgement  inNBT  inOncokb  \\\n",
       "0  False  LOCAL_ENTROPY      1        4     RETAIN  False     False   \n",
       "1  False  LOCAL_ENTROPY      6        6     RETAIN  False     False   \n",
       "2  False  LOCAL_ENTROPY      6        6     RETAIN  False     False   \n",
       "3  False  LOCAL_ENTROPY      6        6     RETAIN  False     False   \n",
       "4  False  LOCAL_ENTROPY      6        6     RETAIN  False     False   \n",
       "\n",
       "                                         Samples  \n",
       "0                             cnsbrain:4|lymph:1  \n",
       "1  cervix:1|esophagusstomach:1|lung:1|pancreas:1  \n",
       "2                      skin:2|esophagusstomach:1  \n",
       "3                                         lung:1  \n",
       "4                                         lung:1  \n",
       "\n",
       "[5 rows x 37 columns]"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hotspots_df_indel.head()\n",
    "# The 'Genomic_Position' column is parsed further down by splitting on '|' and '_'.\n",
    "# Its entries presumably look like the string below ('|'-separated 'chrom:pos_N' items).\n",
    "# NOTE(review): the meaning of the number after '_' is not shown in this notebook.\n",
    "# '1:115256529_252|1:115256530_143|1:115256528_27'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3004, 39)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# (rows, columns) of the SNV hotspot table.\n",
    "hotspots_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pool the raw 'Genomic_Position' strings of the SNV and INDEL hotspot tables.\n",
    "positions = list(hotspots_df['Genomic_Position'].values) + list(hotspots_df_indel['Genomic_Position'].values)\n",
    "\n",
    "# Every string packs several '|'-separated entries; keep only the part before\n",
    "# the first '_' of each entry and let the set collapse repeated positions.\n",
    "positions_list = list({\n",
    "    entry.split('_')[0]\n",
    "    for packed in positions\n",
    "    for entry in packed.split('|')\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2533"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct hotspot positions after de-duplication.\n",
    "len(positions_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Hugo_Symbol</th>\n",
       "      <th>Entrez_Gene_Id</th>\n",
       "      <th>Center</th>\n",
       "      <th>NCBI_Build</th>\n",
       "      <th>Chromosome</th>\n",
       "      <th>Start_Position</th>\n",
       "      <th>End_position</th>\n",
       "      <th>Strand</th>\n",
       "      <th>Tumor_Sample_Barcode</th>\n",
       "      <th>Matched_Norm_Sample_Barcode</th>\n",
       "      <th>...</th>\n",
       "      <th>pos</th>\n",
       "      <th>type</th>\n",
       "      <th>classification</th>\n",
       "      <th>ref_allele</th>\n",
       "      <th>patient</th>\n",
       "      <th>Primary_Met</th>\n",
       "      <th>pair_id</th>\n",
       "      <th>individual_id</th>\n",
       "      <th>case_sample</th>\n",
       "      <th>Pair_Set_ID</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A1BG</td>\n",
       "      <td>1</td>\n",
       "      <td>broad.mit.edu</td>\n",
       "      <td>37</td>\n",
       "      <td>19</td>\n",
       "      <td>58862934</td>\n",
       "      <td>58862934</td>\n",
       "      <td>+</td>\n",
       "      <td>TCGA-EJ-5499</td>\n",
       "      <td>PRAD-TCGA-EJ-5499-Normal-SM-1U3NZ</td>\n",
       "      <td>...</td>\n",
       "      <td>58862934</td>\n",
       "      <td>Missense_Mutation</td>\n",
       "      <td>SNP</td>\n",
       "      <td>G</td>\n",
       "      <td>PRAD-TCGA-EJ-5499-Tumor-SM-1U3IG</td>\n",
       "      <td>Primary</td>\n",
       "      <td>PRAD-TCGA-EJ-5499-TP-NB-SM-1U3IG-SM-1U3NZ</td>\n",
       "      <td>PRAD-TCGA-EJ-5499</td>\n",
       "      <td>PRAD-TCGA-EJ-5499-Tumor-SM-1U3IG</td>\n",
       "      <td>Prim_762017</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A1BG</td>\n",
       "      <td>1</td>\n",
       "      <td>broad.mit.edu</td>\n",
       "      <td>37</td>\n",
       "      <td>19</td>\n",
       "      <td>58863660</td>\n",
       "      <td>58863660</td>\n",
       "      <td>+</td>\n",
       "      <td>MO_1012</td>\n",
       "      <td>MO_1012-Normal</td>\n",
       "      <td>...</td>\n",
       "      <td>58863660</td>\n",
       "      <td>Missense_Mutation</td>\n",
       "      <td>SNP</td>\n",
       "      <td>A</td>\n",
       "      <td>MO_1012-Tumor-Abdomen_wall_nodule</td>\n",
       "      <td>Metastasis</td>\n",
       "      <td>MO_1012_TM_NB_MO_1012-Tumor-Abdomen_wall_nodul...</td>\n",
       "      <td>MO_1012</td>\n",
       "      <td>MO_1012-Tumor-Abdomen_wall_nodule</td>\n",
       "      <td>Met_762017</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A1BG</td>\n",
       "      <td>1</td>\n",
       "      <td>broad.mit.edu</td>\n",
       "      <td>37</td>\n",
       "      <td>19</td>\n",
       "      <td>58863782</td>\n",
       "      <td>58863782</td>\n",
       "      <td>+</td>\n",
       "      <td>TCGA-CH-5752</td>\n",
       "      <td>PRAD-TCGA-CH-5752-Normal-SM-1U3IX</td>\n",
       "      <td>...</td>\n",
       "      <td>58863782</td>\n",
       "      <td>Silent</td>\n",
       "      <td>SNP</td>\n",
       "      <td>C</td>\n",
       "      <td>PRAD-TCGA-CH-5752-Tumor-SM-1U3ID</td>\n",
       "      <td>Primary</td>\n",
       "      <td>PRAD-TCGA-CH-5752-TP-NB-SM-1U3ID-SM-1U3IX</td>\n",
       "      <td>PRAD-TCGA-CH-5752</td>\n",
       "      <td>PRAD-TCGA-CH-5752-Tumor-SM-1U3ID</td>\n",
       "      <td>Prim_762017</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A1BG</td>\n",
       "      <td>1</td>\n",
       "      <td>broad.mit.edu</td>\n",
       "      <td>37</td>\n",
       "      <td>19</td>\n",
       "      <td>58864304</td>\n",
       "      <td>58864304</td>\n",
       "      <td>+</td>\n",
       "      <td>06-134H1_LN</td>\n",
       "      <td>06-134A1_NORMAL</td>\n",
       "      <td>...</td>\n",
       "      <td>58864304</td>\n",
       "      <td>Silent</td>\n",
       "      <td>SNP</td>\n",
       "      <td>C</td>\n",
       "      <td>06-134H1_LN</td>\n",
       "      <td>Metastasis</td>\n",
       "      <td>06-134H1_LN_06-134A1_NORMAL</td>\n",
       "      <td>06-134</td>\n",
       "      <td>06-134H1_LN</td>\n",
       "      <td>Met_762017</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A1CF</td>\n",
       "      <td>29974</td>\n",
       "      <td>broad.mit.edu</td>\n",
       "      <td>37</td>\n",
       "      <td>10</td>\n",
       "      <td>52569681</td>\n",
       "      <td>52569681</td>\n",
       "      <td>+</td>\n",
       "      <td>SC_9126</td>\n",
       "      <td>SC_9126_Normal</td>\n",
       "      <td>...</td>\n",
       "      <td>52569681</td>\n",
       "      <td>Missense_Mutation</td>\n",
       "      <td>SNP</td>\n",
       "      <td>C</td>\n",
       "      <td>SC_9126_Tumor</td>\n",
       "      <td>Metastasis</td>\n",
       "      <td>SC_9126_TM_NB_SC_9126_Tumor_SC_9126_Normal</td>\n",
       "      <td>SC_9126</td>\n",
       "      <td>SC_9126_Tumor</td>\n",
       "      <td>Met_762017</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 56 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  Hugo_Symbol  Entrez_Gene_Id         Center  NCBI_Build  Chromosome  \\\n",
       "0        A1BG               1  broad.mit.edu          37          19   \n",
       "1        A1BG               1  broad.mit.edu          37          19   \n",
       "2        A1BG               1  broad.mit.edu          37          19   \n",
       "3        A1BG               1  broad.mit.edu          37          19   \n",
       "4        A1CF           29974  broad.mit.edu          37          10   \n",
       "\n",
       "   Start_Position  End_position Strand Tumor_Sample_Barcode  \\\n",
       "0        58862934      58862934      +         TCGA-EJ-5499   \n",
       "1        58863660      58863660      +              MO_1012   \n",
       "2        58863782      58863782      +         TCGA-CH-5752   \n",
       "3        58864304      58864304      +          06-134H1_LN   \n",
       "4        52569681      52569681      +              SC_9126   \n",
       "\n",
       "         Matched_Norm_Sample_Barcode     ...            pos  \\\n",
       "0  PRAD-TCGA-EJ-5499-Normal-SM-1U3NZ     ...       58862934   \n",
       "1                     MO_1012-Normal     ...       58863660   \n",
       "2  PRAD-TCGA-CH-5752-Normal-SM-1U3IX     ...       58863782   \n",
       "3                    06-134A1_NORMAL     ...       58864304   \n",
       "4                     SC_9126_Normal     ...       52569681   \n",
       "\n",
       "                type classification ref_allele  \\\n",
       "0  Missense_Mutation            SNP          G   \n",
       "1  Missense_Mutation            SNP          A   \n",
       "2             Silent            SNP          C   \n",
       "3             Silent            SNP          C   \n",
       "4  Missense_Mutation            SNP          C   \n",
       "\n",
       "                             patient Primary_Met  \\\n",
       "0   PRAD-TCGA-EJ-5499-Tumor-SM-1U3IG     Primary   \n",
       "1  MO_1012-Tumor-Abdomen_wall_nodule  Metastasis   \n",
       "2   PRAD-TCGA-CH-5752-Tumor-SM-1U3ID     Primary   \n",
       "3                        06-134H1_LN  Metastasis   \n",
       "4                      SC_9126_Tumor  Metastasis   \n",
       "\n",
       "                                             pair_id      individual_id  \\\n",
       "0          PRAD-TCGA-EJ-5499-TP-NB-SM-1U3IG-SM-1U3NZ  PRAD-TCGA-EJ-5499   \n",
       "1  MO_1012_TM_NB_MO_1012-Tumor-Abdomen_wall_nodul...            MO_1012   \n",
       "2          PRAD-TCGA-CH-5752-TP-NB-SM-1U3ID-SM-1U3IX  PRAD-TCGA-CH-5752   \n",
       "3                        06-134H1_LN_06-134A1_NORMAL             06-134   \n",
       "4         SC_9126_TM_NB_SC_9126_Tumor_SC_9126_Normal            SC_9126   \n",
       "\n",
       "                         case_sample  Pair_Set_ID  \n",
       "0   PRAD-TCGA-EJ-5499-Tumor-SM-1U3IG  Prim_762017  \n",
       "1  MO_1012-Tumor-Abdomen_wall_nodule   Met_762017  \n",
       "2   PRAD-TCGA-CH-5752-Tumor-SM-1U3ID  Prim_762017  \n",
       "3                        06-134H1_LN   Met_762017  \n",
       "4                      SC_9126_Tumor   Met_762017  \n",
       "\n",
       "[5 rows x 56 columns]"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Tab-separated mutation table under PROSTATE_DATA_PATH; skiprows=1 skips the\n",
    "# first line of the file before the header row is read.\n",
    "p1000_maf_file = join(PROSTATE_DATA_PATH, 'raw_data/41588_2018_78_MOESM4_ESM.txt')\n",
    "mut_df = pd.read_csv(p1000_maf_file, sep='\\t', low_memory=False, skiprows=1)\n",
    "mut_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Key every mutation row as 'Chromosome:Start_Position' (both cast to str) so it\n",
    "# can be compared against the strings in positions_list.\n",
    "mut_df['Genomic_Position'] = mut_df[\"Chromosome\"].astype(str) +':'+ mut_df[\"Start_Position\"].astype(str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Boolean mask over mut_df rows: True where the position key is in positions_list.\n",
    "ind = mut_df['Genomic_Position'].isin(positions_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "459"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of mutation rows flagged by the hotspot mask.\n",
    "sum(ind)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store the mask as a column so it can be aggregated per sample and gene below.\n",
    "mut_df['hotspot_indicator'] = ind"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1012, 15741)"
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sample-by-gene matrix: for each (Tumor_Sample_Barcode, Hugo_Symbol) pair, the\n",
    "# sum of hotspot_indicator over that pair's rows in mut_df.\n",
    "mut_df_hotspot_matrix_corrected = mut_df.pivot_table(\n",
    "    index='Tumor_Sample_Barcode',\n",
    "    columns='Hugo_Symbol',\n",
    "    values='hotspot_indicator',\n",
    "    aggfunc='sum',\n",
    ")\n",
    "mut_df_hotspot_matrix_corrected.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Hugo_Symbol</th>\n",
       "      <th>A1BG</th>\n",
       "      <th>A1CF</th>\n",
       "      <th>A2M</th>\n",
       "      <th>A2ML1</th>\n",
       "      <th>A4GALT</th>\n",
       "      <th>A4GNT</th>\n",
       "      <th>AAAS</th>\n",
       "      <th>AACS</th>\n",
       "      <th>AADAC</th>\n",
       "      <th>AADACL2</th>\n",
       "      <th>...</th>\n",
       "      <th>ZW10</th>\n",
       "      <th>ZWILCH</th>\n",
       "      <th>ZWINT</th>\n",
       "      <th>ZXDA</th>\n",
       "      <th>ZXDB</th>\n",
       "      <th>ZXDC</th>\n",
       "      <th>ZYG11B</th>\n",
       "      <th>ZYX</th>\n",
       "      <th>ZZEF1</th>\n",
       "      <th>ZZZ3</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Tumor_Sample_Barcode</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>00-029N9_LN</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>01-087MM_BONE</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>01-095N1_LN</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>01-120A1_LIVER</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>02-083E1_LN</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 15741 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "Hugo_Symbol           A1BG  A1CF  A2M  A2ML1  A4GALT  A4GNT  AAAS  AACS  \\\n",
       "Tumor_Sample_Barcode                                                      \n",
       "00-029N9_LN            0.0   0.0  0.0    0.0     0.0    0.0   0.0   0.0   \n",
       "01-087MM_BONE          0.0   0.0  0.0    0.0     0.0    0.0   0.0   0.0   \n",
       "01-095N1_LN            0.0   0.0  0.0    0.0     0.0    0.0   0.0   0.0   \n",
       "01-120A1_LIVER         0.0   0.0  0.0    0.0     0.0    0.0   0.0   0.0   \n",
       "02-083E1_LN            0.0   0.0  0.0    0.0     0.0    0.0   0.0   0.0   \n",
       "\n",
       "Hugo_Symbol           AADAC  AADACL2  ...   ZW10  ZWILCH  ZWINT  ZXDA  ZXDB  \\\n",
       "Tumor_Sample_Barcode                  ...                                     \n",
       "00-029N9_LN             0.0      0.0  ...    0.0     0.0    0.0   0.0   0.0   \n",
       "01-087MM_BONE           0.0      0.0  ...    0.0     0.0    0.0   0.0   0.0   \n",
       "01-095N1_LN             0.0      0.0  ...    0.0     0.0    0.0   0.0   0.0   \n",
       "01-120A1_LIVER          0.0      0.0  ...    0.0     0.0    0.0   0.0   0.0   \n",
       "02-083E1_LN             0.0      0.0  ...    0.0     0.0    0.0   0.0   0.0   \n",
       "\n",
       "Hugo_Symbol           ZXDC  ZYG11B  ZYX  ZZEF1  ZZZ3  \n",
       "Tumor_Sample_Barcode                                  \n",
       "00-029N9_LN            0.0     0.0  0.0    0.0   0.0  \n",
       "01-087MM_BONE          0.0     0.0  0.0    0.0   0.0  \n",
       "01-095N1_LN            0.0     0.0  0.0    0.0   0.0  \n",
       "01-120A1_LIVER         0.0     0.0  0.0    0.0   0.0  \n",
       "02-083E1_LN            0.0     0.0  0.0    0.0   0.0  \n",
       "\n",
       "[5 rows x 15741 columns]"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Cells left NaN by the pivot are replaced with 0.0; the name is rebound to the\n",
    "# filled frame rather than mutated in place.\n",
    "mut_df_hotspot_matrix_corrected = mut_df_hotspot_matrix_corrected.fillna(0.)\n",
    "mut_df_hotspot_matrix_corrected.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Grand total of the hotspot matrix (summed over all columns, then over the column totals).\n",
    "hotspot_n = mut_df_hotspot_matrix_corrected.sum().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Resolve the processed matrix through PROSTATE_DATA_PATH, the same way the output\n",
    "# file of this notebook is built, instead of a machine-specific absolute path.\n",
    "# NOTE(review): assumes PROSTATE_DATA_PATH points at the _database/prostate directory - confirm in config_path.\n",
    "filename = join(PROSTATE_DATA_PATH, 'processed/P1000_final_analysis_set_cross_important_only.csv')\n",
    "# The first CSV column is used as the row index.\n",
    "p1000_mut_matrix = pd.read_csv(filename, index_col=0)\n",
    "p1000_mut_matrix.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.007532616722737343"
      ]
     },
     "execution_count": 101,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Ratio of the hotspot-matrix total to the total of the P1000 mutation matrix.\n",
    "mut_n=p1000_mut_matrix.sum().sum()\n",
    "hotspot_n/mut_n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Element-wise sum of the two matrices; pandas aligns them on row and column labels.\n",
    "# NOTE(review): labels present in only one frame yield NaN in the result - confirm\n",
    "# that both frames cover the same samples and genes.\n",
    "dd = p1000_mut_matrix + mut_df_hotspot_matrix_corrected"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "60935.0"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "p1000_mut_matrix.sum().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "61394.0"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dd.sum().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "459.0"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mut_df_hotspot_matrix_corrected.sum().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the summed matrix (dd) to the processed folder of the prostate data directory.\n",
    "filename = join(\n",
    "    PROSTATE_DATA_PATH,\n",
    "    'processed/P1000_final_analysis_set_cross_important_only_plus_hotspots.csv'\n",
    ")\n",
    "dd.to_csv(path_or_buf=filename)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Limit to samples and genes that have nonzero hotspot mutations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "mut_df_hotspot = mut_df[ind]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Primary       252\n",
       "Metastasis    207\n",
       "Name: Primary_Met, dtype: int64"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from matplotlib import pyplot as plt\n",
    "\n",
    "# Number of rows in mut_df_hotspot for each distinct Primary_Met value.\n",
    "mut_df_hotspot['Primary_Met'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "364"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(mut_df_hotspot['Tumor_Sample_Barcode'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "mut_df_hotspot.to_csv('P1000_hotspot_mutations_maf.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "54"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(mut_df_hotspot['Hugo_Symbol'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count non-null Variant_Classification entries for every sample (rows) and gene (columns);\n",
    "# sample/gene pairs with no entries come out as NaN.\n",
    "mut_df_hotspot_matrix = pd.pivot_table(\n",
    "    data=mut_df_hotspot,\n",
    "    index='Tumor_Sample_Barcode',\n",
    "    columns='Hugo_Symbol',\n",
    "    values='Variant_Classification',\n",
    "    aggfunc='count'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(364, 54)"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mut_df_hotspot_matrix.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "459.0"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Replace the NaN cells of the matrix with 0.0 in place, then total all entries.\n",
    "mut_df_hotspot_matrix.fillna(value=0., inplace=True)\n",
    "mut_df_hotspot_matrix.sum().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "mut_df_hotspot_matrix.to_csv(join(PROSTATE_DATA_PATH,'processed/P1000_final_analysis_set_cross_hotspots.csv'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:min_env]",
   "language": "python",
   "name": "conda-env-min_env-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
